In [1]:
import pandas as pd
In [3]:
# Read the app listing CSV from the working directory into a DataFrame.
apps_csv_path = 'apps.csv'
df = pd.read_csv(apps_csv_path)
In [5]:
# Preview the first five rows of the app data.
first_rows = df.head(5)
print(first_rows)
Unnamed: 0 App \
0 0 Photo Editor & Candy Camera & Grid & ScrapBook
1 1 Coloring book moana
2 2 U Launcher Lite – FREE Live Cool Themes, Hide ...
3 3 Sketch - Draw & Paint
4 4 Pixel Draw - Number Art Coloring Book
Category Rating Reviews Size Installs Type Price \
0 ART_AND_DESIGN 4.1 159 19.0 10,000+ Free 0
1 ART_AND_DESIGN 3.9 967 14.0 500,000+ Free 0
2 ART_AND_DESIGN 4.7 87510 8.7 5,000,000+ Free 0
3 ART_AND_DESIGN 4.5 215644 25.0 50,000,000+ Free 0
4 ART_AND_DESIGN 4.3 967 2.8 100,000+ Free 0
Content Rating Genres Last Updated \
0 Everyone Art & Design January 7, 2018
1 Everyone Art & Design;Pretend Play January 15, 2018
2 Everyone Art & Design August 1, 2018
3 Teen Art & Design June 8, 2018
4 Everyone Art & Design;Creativity June 20, 2018
Current Ver Android Ver
0 1.0.0 4.0.3 and up
1 2.0.0 4.0.3 and up
2 1.2.4 4.0.3 and up
3 Varies with device 4.2 and up
4 1.1 4.4 and up
In [17]:
# DATA PREPARATION
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)
Unnamed: 0 0 App 0 Category 0 Rating 0 Reviews 0 Size 0 Installs 0 Type 0 Price 0 Content Rating 0 Genres 0 Last Updated 0 Current Ver 0 Android Ver 0 dtype: int64
In [13]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')
In [15]:
# Count the missing values per column again.
remaining_missing = df.isna().sum()
print(remaining_missing)
Unnamed: 0 0 App 0 Category 0 Rating 0 Reviews 0 Size 0 Installs 0 Type 0 Price 0 Content Rating 0 Genres 0 Last Updated 0 Current Ver 0 Android Ver 0 dtype: int64
In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the app data.
basic_stats = df.describe()
print(basic_stats)
Unnamed: 0 Rating Reviews Size count 7021.000000 7021.000000 7.021000e+03 7021.000000 mean 5638.433984 4.160704 1.448960e+05 21.767597 std 3079.108366 0.559241 1.024428e+06 22.731237 min 0.000000 1.000000 1.000000e+00 0.000000 25% 3087.000000 4.000000 8.400000e+01 4.900000 50% 5716.000000 4.300000 1.546000e+03 13.000000 75% 8292.000000 4.500000 2.658700e+04 31.000000 max 10840.000000 5.000000 4.489172e+07 100.000000
In [25]:
# Show the dtype pandas assigned to each column.
column_dtypes = df.dtypes
print(column_dtypes)
Unnamed: 0 int64 App object Category object Rating float64 Reviews int64 Size float64 Installs object Type object Price object Content Rating object Genres object Last Updated object Current Ver object Android Ver object dtype: object
In [27]:
# Category Exploration
In [31]:
# Collect the distinct category labels present in the data.
category_column = df['Category']
unique_categories = category_column.unique()
print(unique_categories)
['ART_AND_DESIGN' 'AUTO_AND_VEHICLES' 'BEAUTY' 'BOOKS_AND_REFERENCE' 'BUSINESS' 'COMICS' 'COMMUNICATION' 'DATING' 'EDUCATION' 'ENTERTAINMENT' 'EVENTS' 'FINANCE' 'FOOD_AND_DRINK' 'HEALTH_AND_FITNESS' 'HOUSE_AND_HOME' 'LIBRARIES_AND_DEMO' 'LIFESTYLE' 'GAME' 'FAMILY' 'MEDICAL' 'SOCIAL' 'SHOPPING' 'PHOTOGRAPHY' 'SPORTS' 'TRAVEL_AND_LOCAL' 'TOOLS' 'PERSONALIZATION' 'PRODUCTIVITY' 'PARENTING' 'WEATHER' 'VIDEO_PLAYERS' 'NEWS_AND_MAGAZINES' 'MAPS_AND_NAVIGATION']
In [33]:
# Number of apps in each category, most frequent first.
category_counts = df['Category'].value_counts(sort=True, ascending=False)
print(category_counts)
Category FAMILY 1511 GAME 832 TOOLS 625 PERSONALIZATION 274 LIFESTYLE 269 MEDICAL 266 FINANCE 258 PRODUCTIVITY 223 BUSINESS 222 SPORTS 221 PHOTOGRAPHY 204 HEALTH_AND_FITNESS 191 COMMUNICATION 188 SOCIAL 156 NEWS_AND_MAGAZINES 154 SHOPPING 146 TRAVEL_AND_LOCAL 141 BOOKS_AND_REFERENCE 141 DATING 122 VIDEO_PLAYERS 112 MAPS_AND_NAVIGATION 94 EDUCATION 88 FOOD_AND_DRINK 72 ENTERTAINMENT 64 AUTO_AND_VEHICLES 63 LIBRARIES_AND_DEMO 60 ART_AND_DESIGN 58 HOUSE_AND_HOME 50 WEATHER 50 COMICS 47 PARENTING 44 EVENTS 38 BEAUTY 37 Name: count, dtype: int64
In [35]:
import matplotlib.pyplot as plt
# Bar chart of the number of apps in each category.
category_axes = category_counts.plot(kind='bar', figsize=(10, 6))
category_axes.set_title('App Distribution Across Categories')
category_axes.set_xlabel('Category')
category_axes.set_ylabel('Number of Apps')
plt.show()
In [37]:
# Metric Analysis
In [39]:
# Descriptive statistics for the app ratings.
rating_column = df['Rating']
ratings_summary = rating_column.describe()
print(ratings_summary)
count 7021.000000 mean 4.160704 std 0.559241 min 1.000000 25% 4.000000 50% 4.300000 75% 4.500000 max 5.000000 Name: Rating, dtype: float64
In [41]:
# Descriptive statistics for the app sizes.
size_column = df['Size']
size_summary = size_column.describe()
print(size_summary)
count 7021.000000 mean 21.767597 std 22.731237 min 0.000000 25% 4.900000 50% 13.000000 75% 31.000000 max 100.000000 Name: Size, dtype: float64
In [43]:
# Summary of the install tiers. 'Installs' is a text column, so describe()
# reports count / unique / top / freq rather than numeric statistics.
installs_column = df['Installs']
popularity_summary = installs_column.describe()
print(popularity_summary)
count 7021 unique 19 top 1,000,000+ freq 1174 Name: Installs, dtype: object
In [45]:
# Summary of the price values. 'Price' is a text column, so describe()
# reports count / unique / top / freq rather than numeric statistics.
price_column = df['Price']
pricing_summary = price_column.describe()
print(pricing_summary)
count 7021 unique 68 top 0 freq 6482 Name: Price, dtype: object
In [53]:
# Histogram for app ratings
rating_axes = df['Rating'].hist(bins=20, edgecolor='black')
rating_axes.set_title('Distribution of App Ratings')
rating_axes.set_xlabel('Rating')
rating_axes.set_ylabel('Frequency')
plt.show()

# Box plot for app sizes
size_axes = df['Size'].plot(kind='box')
size_axes.set_title('Distribution of App Sizes')
size_axes.set_ylabel('Size (MB)')
plt.show()

# Bar chart for app popularity.
# 'Installs' holds text tiers such as '10,000+', so a histogram over the raw
# column treats every tier as an unordered category. Strip the separators,
# convert to numbers, and plot the tier counts in ascending numeric order.
installs_numeric = pd.to_numeric(
    df['Installs'].astype(str).str.replace(r'[,+]', '', regex=True),
    errors='coerce',  # any value that is not a number becomes NaN
)
# value_counts() drops NaN, so every remaining tier can be formatted as an int.
install_tier_counts = installs_numeric.value_counts().sort_index()
install_tier_counts.index = [f'{int(tier):,}+' for tier in install_tier_counts.index]
popularity_axes = install_tier_counts.plot(kind='bar', edgecolor='black')
popularity_axes.set_title('Distribution of App Popularity')
popularity_axes.set_xlabel('Downloads/User Count')
popularity_axes.set_ylabel('Frequency')
plt.show()
In [55]:
# Sentiment Analysis
In [57]:
# Read the user review CSV from the working directory into a DataFrame.
reviews_csv_path = 'user_reviews.csv'
df2 = pd.read_csv(reviews_csv_path)
In [61]:
# Preview the first five rows of the review data.
first_reviews = df2.head(5)
print(first_reviews)
App Translated_Review \ 0 10 Best Foods for You I like eat delicious food. That's I'm cooking ... 1 10 Best Foods for You This help eating healthy exercise regular basis 2 10 Best Foods for You NaN 3 10 Best Foods for You Works great especially going grocery store 4 10 Best Foods for You Best idea us Sentiment Sentiment_Polarity Sentiment_Subjectivity 0 Positive 1.00 0.533333 1 Positive 0.25 0.288462 2 NaN NaN NaN 3 Positive 0.40 0.875000 4 Positive 1.00 0.300000
In [65]:
import re
In [75]:
def clean_text(text) -> str:
    """Normalize a review string for text analysis.

    Keeps only ASCII letters and whitespace, lowercases the result,
    collapses runs of whitespace into a single space and trims both ends.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example
            the NaN shown for missing reviews) is treated as empty.

    Returns:
        The cleaned, lowercase string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # Remove non-alphabetic characters
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    # Dropping punctuation and digits can leave doubled or dangling spaces
    # (e.g. "5 stars - great" -> " stars  great"), so normalize whitespace.
    normalized = re.sub(r'\s+', ' ', letters_only).strip()
    # Convert to lowercase
    return normalized.lower()
In [71]:
# List the column labels of the review data.
review_columns = df2.columns
print(review_columns)
Index(['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity',
'Sentiment_Subjectivity'],
dtype='object')
In [77]:
# Run every translated review through clean_text and store the result
# in a new 'cleaned_reviews' column.
raw_reviews = df2['Translated_Review']
df2['cleaned_reviews'] = raw_reviews.apply(clean_text)
In [79]:
# Show the original and cleaned review text side by side for the first rows.
comparison_columns = ['Translated_Review', 'cleaned_reviews']
print(df2[comparison_columns].head())
Translated_Review \
0 I like eat delicious food. That's I'm cooking ...
1 This help eating healthy exercise regular basis
2 NaN
3 Works great especially going grocery store
4 Best idea us
cleaned_reviews
0 i like eat delicious food thats im cooking foo...
1 this help eating healthy exercise regular basis
2
3 works great especially going grocery store
4 best idea us
In [81]:
# Interactive visualizations
In [87]:
import plotly.express as px
In [89]:
# Example: App distribution across categories
# Count the apps per category first: 'Installs' is a text column, so using it
# as the bar height does not show how many apps each category contains.
category_app_counts = (
    df['Category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Number of Apps')
)
fig = px.bar(category_app_counts, x='Category', y='Number of Apps',
             title='App Distribution Across Categories')
fig.show()
In [95]:
# Example: App ratings vs. size
# One point per app; the app name is added to the hover tooltip.
scatter_options = dict(
    x='Rating',
    y='Size',
    title='App Ratings vs. Size',
    hover_data=['App'],
)
fig = px.scatter(df, **scatter_options)
fig.show()
In [97]:
# Example: Distribution of app ratings
histogram_options = dict(x='Rating', title='Distribution of App Ratings')
fig = px.histogram(df, **histogram_options)
fig.show()
In [99]:
# Example: Customizing the bar chart
# The y axis is labelled 'Number of Apps', so plot real per-category app
# counts instead of the text 'Installs' column.
category_app_counts = (
    df['Category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Number of Apps')
)
fig = px.bar(category_app_counts, x='Category', y='Number of Apps',
             title='App Distribution Across Categories',
             color='Category', barmode='group')
fig.update_layout(xaxis_title='Category', yaxis_title='Number of Apps')
fig.show()
In [ ]: